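#####################
# Purpose: load the two category-lookup datasets (taboo/euphemism sample and
# n-gram sample), clean and combine them, then fit a logistic regression of
# taboo status on WikiProject category membership.
#####################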
library(dplyr)
library(broom)

# Convert log-odds to probabilities.
#   logit: numeric vector of log-odds (e.g. a sum of binomial-glm coefficients).
# Returns a numeric vector of probabilities in [0, 1], one per input element.
logit2prob <- function(logit){
  # plogis() is the logistic CDF, 1 / (1 + exp(-logit)). The naive form
  # exp(logit) / (1 + exp(logit)) overflows to Inf / Inf = NaN once logit
  # exceeds roughly 710; plogis() saturates cleanly at 1 (and at 0 for very
  # negative inputs).
  stats::plogis(logit)
}

# Part 0 - global file locations
# Root of the project tree; the alternative root is kept commented out.
#basePath <- '/home/kaylea/Research/taboo/'
basePath <- '/gscratch/comdata/users/kaylea/taboo/'
dataPath <- paste0(basePath, 'processed_data/')

# Sample files (the tabooSample variant is kept commented out).
ngramPath <- paste0(basePath, 'processed_data/ngramSample.tsv')
#tabCoefPath <- paste0(basePath, 'processed_data/tabooSample.tsv')
tabCoefPath <- paste0(basePath, 'processed_data/euphSample.tsv')

# Category lookup tables (the TCoef variant is kept commented out).
ngramCatPath <- paste0(dataPath, 'catlookups_NGram.tsv')
#tabCatPath <- paste0(dataPath, 'catlookups_TCoef.tsv')
tabCatPath <- paste0(dataPath, 'catlookups_Euph.tsv')

# Read one category-lookup TSV and normalise its full.category column.
#   path: tab-separated file with a header row and a full.category column.
# Returns a data frame with duplicate rows removed, in which full.category
# has had a leading "Category:" stripped and has been lower-cased.
loadCatLookup <- function(path) {
  # quote="" because quotes in the file aren't escaped
  catLookup <- read.table(path, header=TRUE, sep="\t", stringsAsFactors=FALSE, quote="")
  if (!"full.category" %in% names(catLookup)) {
    stop("no full.category column in ", path, call.=FALSE)
  }
  # Anchored sub(): drop "Category:" from the front only. The previous
  # unanchored gsub() also deleted the token from the middle of a name.
  catLookup$full.category <- tolower(sub('^Category:', '', catLookup$full.category))
  # De-duplicate after normalising so rows that differed only in case or
  # prefix collapse to one.
  unique(catLookup)
}

ngramCatDF <- loadCatLookup(ngramCatPath)
tabCatDF <- loadCatLookup(tabCatPath)

# Quick look at both lookup tables before they are labelled.
head(tabCatDF)
head(ngramCatDF)

# Tag each table with where it came from; `taboo` is the 0/1 counterpart
# of `source` and is what the later glm() call uses as its response.
ngramCatDF[['source']] <- 'ngram'
ngramCatDF[['taboo']] <- 0
tabCatDF[['source']] <- 'taboo'
tabCatDF[['taboo']] <- 1

# Stack the tables (taboo rows first). The distinct-category count is taken
# while full.category is still character, then the column becomes a factor.
catDF <- rbind(tabCatDF, ngramCatDF)
distinctCats <- unique(catDF$full.category)
numCats <- length(distinctCats)
rm(distinctCats)
catDF$full.category <- factor(catDF$full.category)


# Rows per category in each lookup table.
tabTab.CatFreq <- tabCatDF %>% count(full.category)
ngramTab.CatFreq <- ngramCatDF %>% count(full.category)

# Express each count as a percentage (one decimal place) of the number of
# distinct titles in the same table, i.e. of those that had categories at all.
tabTab.CatFreq[['prop']] <- round(
  tabTab.CatFreq[['n']] / length(unique(tabCatDF$title)) * 100, 1)
ngramTab.CatFreq[['prop']] <- round(
  ngramTab.CatFreq[['n']] / length(unique(ngramCatDF$title)) * 100, 1)

# Keep the 15 highest-percentage categories, without the raw count column.
topTabCats <- slice(arrange(tabTab.CatFreq, desc(prop)), 1:15)
topTabCats <- topTabCats[, c('full.category', 'prop')]
topNGramCats <- slice(arrange(ngramTab.CatFreq, desc(prop)), 1:15)
topNGramCats <- topNGramCats[, c('full.category', 'prop')]

# Presentation headers for the two summary tables.
names(topTabCats) <- c('Wikiproject-supplied Category', 'Percent')
names(topNGramCats) <- c('Wikiproject-supplied Category', 'Percent')

topTabCats
topNGramCats

###########################

# Build artCatDF: one row per (title, source, taboo) with 0/1 indicator
# columns isRel / isSex / isPol for three WikiProject categories.

# Rows of `cats` whose full.category equals `category`, reduced to the
# remaining (non-category) columns plus an indicator column `flagName` = 1.
#   cats:     data frame with full.category and core.category columns
#   category: exact (lower-cased) category name to match
#   flagName: name of the indicator column to create
flagCategory <- function(cats, category, flagName) {
  # which() drops NA comparisons, as subset() does
  flagged <- cats[which(cats$full.category == category), , drop=FALSE]
  # rep() rather than a bare 1 so a category with no matching rows yields an
  # empty frame instead of a "replacement has 1 row, data has 0" error
  flagged[[flagName]] <- rep(1, nrow(flagged)) #default is yes
  flagged$core.category <- NULL
  flagged$full.category <- NULL
  # Once the category columns are gone, an article that matched on several
  # rows would appear several times and fan out the left merges below.
  unique(flagged)
}

relDF <- flagCategory(catDF, "wikiproject religion articles", "isRel")
polDF <- flagCategory(catDF, "wikiproject politics articles", "isPol")
sexDF <- flagCategory(catDF, "wikiproject sexology and sexuality articles", "isSex")

artCatDF <- catDF
artCatDF$core.category <- NULL
artCatDF$full.category <- NULL
artCatDF <- unique(artCatDF) #one line per title (per source)

# Left merges keep every article; articles without the category get NA.
keyCols <- c('title', 'source', 'taboo')
artCatDF <- merge(artCatDF, relDF, by=keyCols, all.x=TRUE)
artCatDF <- merge(artCatDF, sexDF, by=keyCols, all.x=TRUE)
artCatDF <- merge(artCatDF, polDF, by=keyCols, all.x=TRUE)

## NAs into 0s, but only in the flag columns, so a missing value in any
## other column (e.g. title) is not silently rewritten to 0
flagCols <- c('isRel', 'isSex', 'isPol')
artCatDF[flagCols][is.na(artCatDF[flagCols])] <- 0

### these were only temp anyway
sexDF <- NULL
relDF <- NULL
polDF <- NULL


# Logistic regression of the taboo indicator on the sexuality flag.
# The religion-only, politics-only and combined (psr) models, and everything
# derived from them, are kept commented out.
#catLogitM.rel <- glm(taboo ~ isRel, data=artCatDF, family="binomial") 
#catLogitM.pol <- glm(taboo ~ isPol, data=artCatDF, family="binomial") 
#catLogitM.psr <- glm(taboo ~ isPol + isSex + isRel, data=artCatDF, family="binomial") 
catLogitM.sex <- glm(taboo ~ isSex, data=artCatDF, family="binomial")

#summary(catLogitM.rel)
#summary(catLogitM.pol)
#summary(catLogitM.psr)
summary(catLogitM.sex)

# Fitted probability of taboo when the flag is 1: intercept + slope, mapped
# from log-odds to a probability.
#pol.prob <- logit2prob(coef(catLogitM.psr)[1] + coef(catLogitM.psr)[2])
#rel.prob <- logit2prob(coef(catLogitM.psr)[1] + coef(catLogitM.psr)[4])
sexCoefs <- coef(catLogitM.sex)
sex.prob <- logit2prob(sexCoefs[1] + sexCoefs[2])

# Row 2 of the tidied coefficient table is the isSex term (row 1 is the
# intercept); pull its p-value, estimate and z statistic.
sexTidy <- tidy(catLogitM.sex)

#pol.pv <- tidy(catLogitM.psr)$p.value[2]
#rel.pv <- tidy(catLogitM.psr)$p.value[4]
sex.pv <- sexTidy$p.value[2]

#pol.est <- tidy(catLogitM.psr)$estimate[2]
#rel.est <- tidy(catLogitM.psr)$estimate[4]
sex.est <- sexTidy$estimate[2]

#pol.z <- tidy(catLogitM.psr)$statistic[2]
#rel.z <- tidy(catLogitM.psr)$statistic[4]
sex.z <- sexTidy$statistic[2]

# scratch objects only
rm(sexCoefs, sexTidy)


